# you may need to install the packages
# install.packages("stringr")
# install.packages("plotly")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the mobile food schedule; text columns are kept as character vectors.
dat <- read.csv(
  file = "mobile-food-sf.csv",
  stringsAsFactors = FALSE
)

# Number of schedule entries for each day of the week.
day_freqs <- table(dat$DayOfWeekStr)

# Base-graphics bar chart without bar borders (las = 3: vertical axis labels).
barplot(height = day_freqs, border = NA, las = 3)

# The same counts drawn as a plotly bar chart.
plot_ly(
  type = "bar",
  x = names(day_freqs),
  y = day_freqs
)
# Weekday frequency table built with dplyr: one row per weekday with its
# number of entries, sorted from the most to the least frequent day.
# Grouping by DayOfWeekStr and summarising keeps only that column and `count`.
day_counts <- dat %>%
  group_by(DayOfWeekStr) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Show the table.
day_counts
## # A tibble: 7 × 2
## DayOfWeekStr count
## <chr> <int>
## 1 Friday 1105
## 2 Wednesday 1095
## 3 Thursday 1090
## 4 Tuesday 1081
## 5 Monday 1080
## 6 Saturday 533
## 7 Sunday 263
# Bar chart of the weekday counts, weekday names on the x axis.
day_counts %>%
  plot_ly(x = ~DayOfWeekStr, y = ~count, type = "bar")

# Same chart, but with the weekday wrapped in reorder() so that its factor
# levels are sorted by `count` instead of following the row order.
day_counts %>%
  plot_ly(x = ~reorder(DayOfWeekStr, count), y = ~count, type = "bar")
# Your turn: What about times where the hour has just one digit, for example 9AM or 8AM? Create the following vector `times` and try to extract the hours and the periods with str_sub().
# Example times: some hours have one digit, others two.
times <- c("12PM", "10AM", "9AM", "8AM", "2PM")

# Hour: everything up to the third character counted from the end, which
# works for one- and two-digit hours alike.
str_sub(times, start = 1L, end = -3L)
## [1] "12" "10" "9" "8" "2"

# Period: the last two characters.
str_sub(times, start = -2L)
## [1] "PM" "AM" "AM" "AM" "PM"

# Alternative way to get the hour: replace the AM/PM marker with nothing.
str_replace(times, "AM|PM", "")
## [1] "12" "10" "9" "8" "2"
# Split every start time in the data into its hour part and its period part
# (AM or PM), counting characters from the right. Both results are character
# vectors at this point.
hours <- str_sub(dat$starttime, start = 1L, end = -3L)
periods <- str_sub(dat$starttime, start = -2L)

# Frequency table of the periods as a data frame; the second column (the
# frequencies) is renamed to `counts`.
time_counts <- data.frame(table(periods))
names(time_counts)[2] <- "counts"

# Bar chart of the AM and PM counts.
time_counts %>%
  plot_ly(x = ~periods, y = ~counts, type = "bar")
# start24: the starting hour on a 24-hour scale, kept in the "H:00" text form.
# `%% 12` turns hour 12 into 0 before the PM offset is added, so 12AM becomes
# "0:00" and 12PM becomes "12:00". Simply adding 12 to every PM hour would
# give "24:00" for 12PM and leave 12AM at "12:00".
start24 <- paste0(
  as.integer(hours) %% 12L + 12L * (periods == "PM"),
  ":00"
)
# Add two numeric columns, `start` and `end`, to `dat`: the starting and
# ending hour of each entry on a 24-hour scale.

#' Convert clock strings such as "9AM" or "12PM" to hours on a 24-hour scale.
#'
#' @param clock Character vector; each element is an hour followed by the
#'   two-character period "AM" or "PM".
#' @return Numeric vector as long as `clock`. "12AM" gives 0, "12PM" gives 12,
#'   and a missing time gives NA.
to_hour24 <- function(clock) {
  hour <- as.numeric(str_sub(clock, end = -3))
  is_pm <- str_sub(clock, start = -2) == "PM"
  # `%% 12` maps hour 12 to 0, so 12AM and 12PM need no special cases once
  # the PM offset is added.
  hour %% 12 + 12 * is_pm
}

# One shared conversion for both columns; no scratch `period` column is
# written to (and later deleted from) `dat`.
dat$start <- to_hour24(dat$starttime)
dat$end <- to_hour24(dat$endtime)
# duration: number of hours between the starting and the ending hour.
# An ending hour smaller than the starting hour means the entry runs past
# midnight; taking the difference modulo 24 counts it as ending on the
# following day. `end` itself is not modified, so it stays on the 0-23 scale
# (temporarily adding 24 and undoing it only for values >= 25 would leave a
# 12AM closing time stored as 24). Missing hours give a missing duration.
dat$duration <- (dat$end - dat$start) %% 24
# Latitude and Longitude Coordinates
# One raw location string of the form "(latitude,longitude)".
loc1 <- "(37.7651967350509,-122.416451692902)"

# Strip the opening parenthesis; "(" is a regex metacharacter, hence "\\(".
str_replace(loc1, "\\(", "")
## [1] "37.7651967350509,-122.416451692902)"

# Strip the closing parenthesis.
str_replace(loc1, "\\)", "")
## [1] "(37.7651967350509,-122.416451692902"

# str_replace() only replaces the first match, so one parenthesis survives ...
str_replace(loc1, "\\(|\\)", "")
## [1] "37.7651967350509,-122.416451692902)"

# ... whereas str_replace_all() replaces every match.
str_replace_all(loc1, "\\(|\\)", "")
## [1] "37.7651967350509,-122.416451692902"

# Keep the cleaned string for the next steps.
lat_lon <- str_replace_all(loc1, "\\(|\\)", "")

# Replacing the comma with nothing fuses the two numbers together.
str_replace(lat_lon, ",", "")
## [1] "37.7651967350509-122.416451692902"

# Splitting on the comma yields latitude and longitude as separate strings.
str_split(lat_lon, ",")
## [[1]]
## [1] "37.7651967350509" "-122.416451692902"
# Whole column at once: remove the parentheses from every location string and
# split each one on the comma. `lat_lon` is a list with one character vector
# per row of `dat`.
lat_lon <- str_split(
  str_replace_all(dat$Location, "\\(|\\)", ""),
  ","
)

# First piece of each split is the latitude, second piece the longitude;
# both are converted from text to numbers.
lat <- as.numeric(vapply(lat_lon, function(parts) parts[1], character(1)))
lon <- as.numeric(vapply(lat_lon, function(parts) parts[2], character(1)))

# Store the coordinates as new columns of the data frame.
dat[["lat"]] <- lat
dat[["lon"]] <- lon
# Plotting locations on a map
# Base-graphics scatterplot of the locations: filled circles (pch = 19) in a
# grey with an alpha channel (#77777744).
plot(x = dat$lon, y = dat$lat, pch = 19, col = "#77777744")

# plotly scatterplot, leaving trace type and mode for plotly to pick
plot_ly(x = lon, y = lat)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 40 observations

# the same scatterplot with trace type and mode given explicitly
plot_ly(x = lon, y = lat, type = "scatter", mode = "markers")
## Warning: Ignoring 40 observations

# and once more, taking the coordinates from the columns of `dat`
dat %>%
  plot_ly(x = ~lon, y = ~lat, type = "scatter", mode = "markers")
## Warning: Ignoring 40 observations
library(RgoogleMaps)
# Map centre: mean latitude and mean longitude, ignoring missing coordinates.
center <- with(dat, c(mean(lat, na.rm = TRUE), mean(lon, na.rm = TRUE)))

# Zoom level: the smallest value MaxZoom() returns for the latitude range and
# the longitude range of the data.
zoom <- min(
  MaxZoom(
    range(dat$lat, na.rm = TRUE),
    range(dat$lon, na.rm = TRUE)
  )
)

# San Francisco map for that centre and zoom (destfile: "san-francisco.png"),
# with the locations drawn on top as small filled dots.
map1 <- GetMap(center = center, zoom = zoom, destfile = "san-francisco.png")
PlotOnStaticMap(map1, lat = dat$lat, lon = dat$lon, col = "#ed4964", pch = 20)
library(ggmap)
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
# Drop every row of `dat` that has a missing value in any column.
dat <- na.omit(dat)

# ggmap usually wants a zoom level; here a bounding box is derived from the
# coordinates with ggmap's make_bbox() instead.
sbbox <- with(dat, make_bbox(lon = lon, lat = lat, f = 0.1))
sbbox
## left bottom right top
## -122.48867 37.69985 -122.36281 37.81595

# Fetch a "terrain" map for that bounding box.
sf_map <- get_map(sbbox, maptype = "terrain", source = "google")
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.757897,-122.425744&zoom=13&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false

# Draw the locations on the map as small, mostly transparent red points.
ggmap(sf_map) +
  geom_point(
    aes(x = lon, y = lat),
    data = dat,
    colour = "red", alpha = 0.2, size = 1
  )
## Warning: Removed 98 rows containing missing values (geom_point).
# A look at the first free-text food descriptions.
dat$optionaltext[seq_len(3)]
## [1] "Tacos, Burritos, Tortas, Quesadillas, Mexican Drinks, Aguas Frescas"
## [2] "Cold Truck: sandwiches, drinks, snacks, candy, hot coffee"
## [3] "Cold Truck: Pre-packaged Sandwiches, Various Beverages, Salads, Snacks"

# Work with the first ten descriptions.
foods <- dat$optionaltext[seq_len(10)]

# Which descriptions mention burritos? stringr version ...
str_detect(foods, "Burritos|burritos")
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# ... and base R version, here ignoring case.
grepl("Burritos|burritos", foods, ignore.case = TRUE)
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# Tacos or quesadillas: the lower-case pattern finds nothing with the
# case-sensitive str_detect() ...
str_detect(foods, "tacos|quesadillas")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# ... but does find matches once case is ignored.
grepl("tacos|quesadillas", foods, ignore.case = TRUE)
## [1] TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
# Three ways to keep the rows whose description mentions burritos:
# base R row indexing, filter() as a plain call, and filter() in a pipe.
burritos <- dat[str_detect(dat$optionaltext, "Burritos|burritos"), ]
burritos2 <- filter(dat, str_detect(dat$optionaltext, "Burritos|burritos"))
burritos3 <- dat %>%
  filter(str_detect(optionaltext, "Burritos|burritos"))
# Map of the burrito rows only.
# As before, a bounding box from make_bbox() stands in for a zoom level.
sbbox <- with(burritos, make_bbox(lon = lon, lat = lat, f = 0.1))
sbbox
## left bottom right top
## -122.48776 37.70077 -122.37289 37.80573

# Fetch a "terrain" map for the burrito bounding box.
sf_map <- get_map(sbbox, maptype = "terrain", source = "google")
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.753253,-122.430325&zoom=13&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false

# Plot the burrito locations; na.rm = TRUE silences the removed-rows warning.
ggmap(sf_map) +
  geom_point(
    aes(x = lon, y = lat),
    data = burritos,
    colour = "red", alpha = 0.2, size = 1, na.rm = TRUE
  )
# Tag each food subset with a `type` label, stack the subsets, and draw one
# map panel per food type.
burritos["type"] <- "burritos"

quesadillas <- dat[str_detect(dat$optionaltext, "quesadillas|Quesadillas"), ]
quesadillas["type"] <- "quesadillas"

# The taco subset has to search for "tacos"; matching "quesadillas" here
# would only repeat the quesadilla rows under the "tacos" label.
tacos <- dat[grepl("tacos", dat$optionaltext, ignore.case = TRUE), ]
tacos["type"] <- "tacos"

# A row that mentions several foods appears once per matching type.
dat_food <- rbind(burritos, quesadillas, tacos)

# Bounding box and terrain map covering all three subsets.
loc <- make_bbox(lon = dat_food$lon, lat = dat_food$lat, f = 0.1)
food_map <- get_map(location = loc, maptype = "terrain", source = "google")
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)

# One facet per food type.
ggmap(food_map) +
  geom_point(
    data = dat_food,
    mapping = aes(x = lon, y = lat),
    color = "red", alpha = 0.2, size = 1, na.rm = TRUE
  ) +
  facet_wrap(~ type)